Ames Housing data - Exploratory Data Analysis (EDA)

In [2]:
# Bibliotecas
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
In [3]:
# Garantindo que todas colunas serão mostradas
pd.set_option('display.max_columns', None)
# importando os dado de treino
range1 = [i for i in range(2,82)]
usecols = range1
ames_train = pd.read_csv ('ames_train.csv', sep=';', usecols=usecols)

Data cleaning

In [4]:
# price column as the first column
ames_train = ames_train[['price','area','MS.SubClass','MS.Zoning','Lot.Frontage','Lot.Area','Street','Alley','Lot.Shape','Land.Contour','Utilities','Lot.Config','Land.Slope','Neighborhood','Condition.1','Condition.2','Bldg.Type','House.Style','Overall.Qual','Overall.Cond','Year.Built','Year.Remod.Add','Roof.Style','Roof.Matl','Exterior.1st','Exterior.2nd','Mas.Vnr.Type','Mas.Vnr.Area','Exter.Qual','Exter.Cond','Foundation','Bsmt.Qual','Bsmt.Cond','Bsmt.Exposure','BsmtFin.Type.1','BsmtFin.SF.1','BsmtFin.Type.2','BsmtFin.SF.2','Bsmt.Unf.SF','Total.Bsmt.SF','Heating','Heating.QC','Central.Air','Electrical','X1st.Flr.SF','X2nd.Flr.SF','Low.Qual.Fin.SF','Bsmt.Full.Bath','Bsmt.Half.Bath','Full.Bath','Half.Bath','Bedroom.AbvGr','Kitchen.AbvGr','Kitchen.Qual','TotRms.AbvGrd','Functional','Fireplaces','Fireplace.Qu','Garage.Type','Garage.Yr.Blt','Garage.Finish','Garage.Cars','Garage.Area','Garage.Qual','Garage.Cond','Paved.Drive','Wood.Deck.SF','Open.Porch.SF','Enclosed.Porch','X3Ssn.Porch','Screen.Porch','Pool.Area','Pool.QC','Fence','Misc.Feature','Misc.Val','Mo.Sold','Yr.Sold','Sale.Type','Sale.Condition']]
In [5]:
ames_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 80 columns):
price              1000 non-null int64
area               1000 non-null int64
MS.SubClass        1000 non-null int64
MS.Zoning          1000 non-null object
Lot.Frontage       833 non-null float64
Lot.Area           1000 non-null int64
Street             1000 non-null object
Alley              67 non-null object
Lot.Shape          1000 non-null object
Land.Contour       1000 non-null object
Utilities          1000 non-null object
Lot.Config         1000 non-null object
Land.Slope         1000 non-null object
Neighborhood       1000 non-null object
Condition.1        1000 non-null object
Condition.2        1000 non-null object
Bldg.Type          1000 non-null object
House.Style        1000 non-null object
Overall.Qual       1000 non-null int64
Overall.Cond       1000 non-null int64
Year.Built         1000 non-null int64
Year.Remod.Add     1000 non-null int64
Roof.Style         1000 non-null object
Roof.Matl          1000 non-null object
Exterior.1st       1000 non-null object
Exterior.2nd       1000 non-null object
Mas.Vnr.Type       993 non-null object
Mas.Vnr.Area       993 non-null float64
Exter.Qual         1000 non-null object
Exter.Cond         1000 non-null object
Foundation         1000 non-null object
Bsmt.Qual          978 non-null object
Bsmt.Cond          978 non-null object
Bsmt.Exposure      977 non-null object
BsmtFin.Type.1     978 non-null object
BsmtFin.SF.1       999 non-null float64
BsmtFin.Type.2     978 non-null object
BsmtFin.SF.2       999 non-null float64
Bsmt.Unf.SF        999 non-null float64
Total.Bsmt.SF      999 non-null float64
Heating            1000 non-null object
Heating.QC         1000 non-null object
Central.Air        1000 non-null object
Electrical         1000 non-null object
X1st.Flr.SF        1000 non-null int64
X2nd.Flr.SF        1000 non-null int64
Low.Qual.Fin.SF    1000 non-null int64
Bsmt.Full.Bath     999 non-null float64
Bsmt.Half.Bath     999 non-null float64
Full.Bath          1000 non-null int64
Half.Bath          1000 non-null int64
Bedroom.AbvGr      1000 non-null int64
Kitchen.AbvGr      1000 non-null int64
Kitchen.Qual       1000 non-null object
TotRms.AbvGrd      1000 non-null int64
Functional         1000 non-null object
Fireplaces         1000 non-null int64
Fireplace.Qu       509 non-null object
Garage.Type        954 non-null object
Garage.Yr.Blt      952 non-null float64
Garage.Finish      952 non-null object
Garage.Cars        999 non-null float64
Garage.Area        999 non-null float64
Garage.Qual        952 non-null object
Garage.Cond        952 non-null object
Paved.Drive        1000 non-null object
Wood.Deck.SF       1000 non-null int64
Open.Porch.SF      1000 non-null int64
Enclosed.Porch     1000 non-null int64
X3Ssn.Porch        1000 non-null int64
Screen.Porch       1000 non-null int64
Pool.Area          1000 non-null int64
Pool.QC            3 non-null object
Fence              202 non-null object
Misc.Feature       29 non-null object
Misc.Val           1000 non-null int64
Mo.Sold            1000 non-null int64
Yr.Sold            1000 non-null int64
Sale.Type          1000 non-null object
Sale.Condition     1000 non-null object
dtypes: float64(11), int64(26), object(43)
memory usage: 625.1+ KB
In [6]:
# Evaluating numeric data
ames_numeric = ames_train.select_dtypes(include=[np.number])
ames_numeric.dtypes
ames_numeric.describe()
Out[6]:
price area MS.SubClass Lot.Frontage Lot.Area Overall.Qual Overall.Cond Year.Built Year.Remod.Add Mas.Vnr.Area BsmtFin.SF.1 BsmtFin.SF.2 Bsmt.Unf.SF Total.Bsmt.SF X1st.Flr.SF X2nd.Flr.SF Low.Qual.Fin.SF Bsmt.Full.Bath Bsmt.Half.Bath Full.Bath Half.Bath Bedroom.AbvGr Kitchen.AbvGr TotRms.AbvGrd Fireplaces Garage.Yr.Blt Garage.Cars Garage.Area Wood.Deck.SF Open.Porch.SF Enclosed.Porch X3Ssn.Porch Screen.Porch Pool.Area Misc.Val Mo.Sold Yr.Sold
count 1000.000000 1000.000000 1000.000000 833.000000 1000.000000 1000.000000 1000.00000 1000.000000 1000.000000 993.000000 999.000000 999.000000 999.000000 999.000000 1000.00000 1000.000000 1000.000000 999.000000 999.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 952.000000 999.000000 999.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 181190.076000 1476.615000 57.150000 69.214886 10352.406000 6.095000 5.55900 1972.203000 1984.338000 104.126888 464.145145 48.067067 546.988989 1059.201201 1157.09000 315.205000 4.320000 0.447447 0.061061 1.541000 0.378000 2.806000 1.039000 6.340000 0.597000 1977.919118 1.766767 475.410410 93.843000 48.931000 23.479000 3.118000 14.772000 1.463000 45.812000 6.243000 2007.766000
std 81909.787799 505.174189 43.020673 24.285327 9827.841077 1.412788 1.08982 29.637412 20.557489 184.414232 453.546342 167.031859 428.078260 426.330132 385.46496 424.281616 48.893826 0.530617 0.247786 0.544624 0.499365 0.833702 0.198793 1.575081 0.654996 25.274534 0.764084 218.776026 123.324691 69.586828 60.590567 29.216874 53.963915 30.477623 560.287897 2.752004 1.303435
min 12789.000000 334.000000 20.000000 21.000000 1470.000000 1.000000 1.00000 1872.000000 1950.000000 0.000000 0.000000 0.000000 0.000000 0.000000 334.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 0.000000 1900.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 2006.000000
25% 129762.500000 1092.000000 20.000000 57.000000 7314.000000 5.000000 5.00000 1955.000000 1966.000000 0.000000 0.000000 0.000000 223.500000 797.500000 876.25000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 2.000000 1.000000 5.000000 0.000000 1961.000000 1.000000 312.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.000000 2007.000000
50% 159467.000000 1411.000000 50.000000 69.000000 9317.000000 6.000000 5.00000 1975.000000 1992.500000 0.000000 400.000000 0.000000 461.000000 998.000000 1080.50000 0.000000 0.000000 0.000000 0.000000 2.000000 0.000000 3.000000 1.000000 6.000000 1.000000 1979.000000 2.000000 480.000000 0.000000 28.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.000000 2008.000000
75% 213000.000000 1743.250000 70.000000 80.000000 11649.750000 7.000000 6.00000 2001.000000 2004.000000 160.000000 773.000000 0.000000 783.000000 1301.000000 1376.25000 688.250000 0.000000 1.000000 0.000000 2.000000 1.000000 3.000000 1.000000 7.000000 1.000000 2002.000000 2.000000 576.000000 168.000000 74.000000 0.000000 0.000000 0.000000 0.000000 0.000000 8.000000 2009.000000
max 615000.000000 4676.000000 190.000000 313.000000 215245.000000 10.000000 9.00000 2010.000000 2010.000000 1290.000000 2260.000000 1526.000000 2336.000000 3138.000000 3138.00000 1836.000000 1064.000000 3.000000 2.000000 4.000000 2.000000 6.000000 2.000000 13.000000 4.000000 2010.000000 5.000000 1390.000000 857.000000 742.000000 432.000000 508.000000 440.000000 800.000000 15500.000000 12.000000 2010.000000

Some variables have null values. Next, we will analyze them and decide the best way to impute data.

In [7]:
print(ames_numeric.isnull().sum().to_string())
price                0
area                 0
MS.SubClass          0
Lot.Frontage       167
Lot.Area             0
Overall.Qual         0
Overall.Cond         0
Year.Built           0
Year.Remod.Add       0
Mas.Vnr.Area         7
BsmtFin.SF.1         1
BsmtFin.SF.2         1
Bsmt.Unf.SF          1
Total.Bsmt.SF        1
X1st.Flr.SF          0
X2nd.Flr.SF          0
Low.Qual.Fin.SF      0
Bsmt.Full.Bath       1
Bsmt.Half.Bath       1
Full.Bath            0
Half.Bath            0
Bedroom.AbvGr        0
Kitchen.AbvGr        0
TotRms.AbvGrd        0
Fireplaces           0
Garage.Yr.Blt       48
Garage.Cars          1
Garage.Area          1
Wood.Deck.SF         0
Open.Porch.SF        0
Enclosed.Porch       0
X3Ssn.Porch          0
Screen.Porch         0
Pool.Area            0
Misc.Val             0
Mo.Sold              0
Yr.Sold              0
In [8]:
ames_train['Lot.Frontage'].describe()
Out[8]:
count    833.000000
mean      69.214886
std       24.285327
min       21.000000
25%       57.000000
50%       69.000000
75%       80.000000
max      313.000000
Name: Lot.Frontage, dtype: float64
In [9]:
ames_train['Mas.Vnr.Area'].describe()
Out[9]:
count     993.000000
mean      104.126888
std       184.414232
min         0.000000
25%         0.000000
50%         0.000000
75%       160.000000
max      1290.000000
Name: Mas.Vnr.Area, dtype: float64
In [10]:
ames_train['BsmtFin.SF.1'].describe()
Out[10]:
count     999.000000
mean      464.145145
std       453.546342
min         0.000000
25%         0.000000
50%       400.000000
75%       773.000000
max      2260.000000
Name: BsmtFin.SF.1, dtype: float64
In [11]:
ames_train['Bsmt.Full.Bath'].value_counts()
Out[11]:
0.0    568
1.0    416
2.0     14
3.0      1
Name: Bsmt.Full.Bath, dtype: int64
In [12]:
ames_train['Full.Bath'].value_counts()
Out[12]:
2    508
1    472
3     15
0      3
4      2
Name: Full.Bath, dtype: int64

Ops, 3 houses with 0 bathroons? 🤔

Thats's odd

In [13]:
ames_train['Garage.Yr.Blt'].value_counts()
Out[13]:
2005.0    49
2006.0    41
2007.0    40
2004.0    35
1977.0    29
          ..
1942.0     1
1927.0     1
1935.0     1
1908.0     1
1929.0     1
Name: Garage.Yr.Blt, Length: 93, dtype: int64
In [14]:
ames_train['Sale.Condition'].value_counts()
Out[14]:
Normal     834
Partial     82
Abnorml     61
Family      17
Alloca       4
AdjLand      2
Name: Sale.Condition, dtype: int64

Taking care of missing values.

For continuous variables I will use the mean and for discrete variables, the mode.

In [15]:
ames_train['Lot.Frontage'] = ames_train['Lot.Frontage'].fillna(ames_train['Lot.Frontage'].mean())
ames_train['Mas.Vnr.Area'] = ames_train['Mas.Vnr.Area'].fillna(ames_train['Mas.Vnr.Area'].mean())
ames_train['BsmtFin.SF.1'] = ames_train['BsmtFin.SF.1'].fillna(ames_train['BsmtFin.SF.1'].mean())
ames_train['BsmtFin.SF.2'] = ames_train['BsmtFin.SF.2'].fillna(ames_train['BsmtFin.SF.2'].mean())
ames_train['Bsmt.Unf.SF'] = ames_train['Bsmt.Unf.SF'].fillna(ames_train['Bsmt.Unf.SF'].mean())
ames_train['Total.Bsmt.SF'] = ames_train['Total.Bsmt.SF'].fillna(ames_train['Total.Bsmt.SF'].mean())
ames_train['Bsmt.Full.Bath'] = ames_train['Bsmt.Full.Bath'].fillna(0)
ames_train['Bsmt.Half.Bath'] = ames_train['Bsmt.Half.Bath'].fillna(0)
ames_train['Garage.Yr.Blt'] = ames_train['Garage.Yr.Blt'].fillna(0)
ames_train['Garage.Area'] = ames_train['Garage.Area'].fillna(0)
ames_train['Garage.Cars'] = ames_train['Garage.Cars'].fillna(0)

print(ames_train.isnull().sum().to_string())
price                0
area                 0
MS.SubClass          0
MS.Zoning            0
Lot.Frontage         0
Lot.Area             0
Street               0
Alley              933
Lot.Shape            0
Land.Contour         0
Utilities            0
Lot.Config           0
Land.Slope           0
Neighborhood         0
Condition.1          0
Condition.2          0
Bldg.Type            0
House.Style          0
Overall.Qual         0
Overall.Cond         0
Year.Built           0
Year.Remod.Add       0
Roof.Style           0
Roof.Matl            0
Exterior.1st         0
Exterior.2nd         0
Mas.Vnr.Type         7
Mas.Vnr.Area         0
Exter.Qual           0
Exter.Cond           0
Foundation           0
Bsmt.Qual           22
Bsmt.Cond           22
Bsmt.Exposure       23
BsmtFin.Type.1      22
BsmtFin.SF.1         0
BsmtFin.Type.2      22
BsmtFin.SF.2         0
Bsmt.Unf.SF          0
Total.Bsmt.SF        0
Heating              0
Heating.QC           0
Central.Air          0
Electrical           0
X1st.Flr.SF          0
X2nd.Flr.SF          0
Low.Qual.Fin.SF      0
Bsmt.Full.Bath       0
Bsmt.Half.Bath       0
Full.Bath            0
Half.Bath            0
Bedroom.AbvGr        0
Kitchen.AbvGr        0
Kitchen.Qual         0
TotRms.AbvGrd        0
Functional           0
Fireplaces           0
Fireplace.Qu       491
Garage.Type         46
Garage.Yr.Blt        0
Garage.Finish       48
Garage.Cars          0
Garage.Area          0
Garage.Qual         48
Garage.Cond         48
Paved.Drive          0
Wood.Deck.SF         0
Open.Porch.SF        0
Enclosed.Porch       0
X3Ssn.Porch          0
Screen.Porch         0
Pool.Area            0
Pool.QC            997
Fence              798
Misc.Feature       971
Misc.Val             0
Mo.Sold              0
Yr.Sold              0
Sale.Type            0
Sale.Condition       0

checking categorical data

In [16]:
ames_categorical = ames_train.select_dtypes(exclude=[np.number])
ames_categorical.describe()
Out[16]:
MS.Zoning Street Alley Lot.Shape Land.Contour Utilities Lot.Config Land.Slope Neighborhood Condition.1 Condition.2 Bldg.Type House.Style Roof.Style Roof.Matl Exterior.1st Exterior.2nd Mas.Vnr.Type Exter.Qual Exter.Cond Foundation Bsmt.Qual Bsmt.Cond Bsmt.Exposure BsmtFin.Type.1 BsmtFin.Type.2 Heating Heating.QC Central.Air Electrical Kitchen.Qual Functional Fireplace.Qu Garage.Type Garage.Finish Garage.Qual Garage.Cond Paved.Drive Pool.QC Fence Misc.Feature Sale.Type Sale.Condition
count 1000 1000 67 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 993 1000 1000 1000 978 978 977 978 978 1000 1000 1000 1000 1000 1000 509 954 952 952 952 1000 3 202 29 1000 1000
unique 6 2 2 4 4 1 5 3 27 9 6 5 7 5 5 12 13 4 4 4 5 5 5 4 6 6 5 5 2 4 5 7 5 6 3 5 5 3 3 4 4 10 6
top RL Pave Pave Reg Lvl AllPub Inside Gtl NAmes Norm Norm 1Fam 1Story Gable CompShg VinylSd VinylSd None TA TA PConc TA TA No GLQ Unf GasA Ex Y SBrkr TA Typ Gd Attchd Unf TA TA Y Ex MnPrv Shed WD Normal
freq 772 997 34 629 909 1000 710 962 155 875 988 823 521 775 984 349 345 593 613 861 453 438 908 635 294 863 988 516 945 932 509 935 232 610 427 904 918 904 1 120 25 863 834
In [17]:
print(ames_categorical.isnull().sum().to_string())
MS.Zoning           0
Street              0
Alley             933
Lot.Shape           0
Land.Contour        0
Utilities           0
Lot.Config          0
Land.Slope          0
Neighborhood        0
Condition.1         0
Condition.2         0
Bldg.Type           0
House.Style         0
Roof.Style          0
Roof.Matl           0
Exterior.1st        0
Exterior.2nd        0
Mas.Vnr.Type        7
Exter.Qual          0
Exter.Cond          0
Foundation          0
Bsmt.Qual          22
Bsmt.Cond          22
Bsmt.Exposure      23
BsmtFin.Type.1     22
BsmtFin.Type.2     22
Heating             0
Heating.QC          0
Central.Air         0
Electrical          0
Kitchen.Qual        0
Functional          0
Fireplace.Qu      491
Garage.Type        46
Garage.Finish      48
Garage.Qual        48
Garage.Cond        48
Paved.Drive         0
Pool.QC           997
Fence             798
Misc.Feature      971
Sale.Type           0
Sale.Condition      0

For categorical variables I will assume that the lack of data indicates that that variable does not apply to the property.

In [18]:
ames_train['Alley'] = ames_train['Alley'].replace({np.nan: 'No Alley'})
ames_train['Mas.Vnr.Type'] = ames_train['Mas.Vnr.Type'].replace({np.nan: 'Mas.Vnr.Type'})
ames_train['Bsmt.Qual'] = ames_train['Bsmt.Qual'].replace({np.nan: 'No Bsmt'})
ames_train['Bsmt.Cond'] = ames_train['Bsmt.Cond'].replace({np.nan: 'No Bsmt'})
ames_train['Bsmt.Exposure'] = ames_train['Bsmt.Exposure'].replace({np.nan: 'No Bsmt'})
ames_train['BsmtFin.Type.1'] = ames_train['BsmtFin.Type.1'].replace({np.nan: 'No Bsmt'})
ames_train['BsmtFin.Type.2'] = ames_train['BsmtFin.Type.2'].replace({np.nan: 'No Bsmt'})
ames_train['Garage.Type'] = ames_train['Garage.Type'].replace({np.nan: 'No Garage'})
ames_train['Garage.Finish'] = ames_train['Garage.Finish'].replace({np.nan: 'No Garage'})
ames_train['Garage.Qual'] = ames_train['Garage.Qual'].replace({np.nan: 'No Garage'})
ames_train['Garage.Cond'] = ames_train['Garage.Cond'].replace({np.nan: 'No Garage'})
ames_train['Pool.QC'] = ames_train['Pool.QC'].replace({np.nan: 'No Pool'})
ames_train['Fence'] = ames_train['Fence'].replace({np.nan: 'No Fence'})
ames_train['Misc.Feature'] = ames_train['Misc.Feature'].replace({np.nan: 'No Misc.Feature'})
ames_train['Fireplace.Qu'] = ames_train['Fireplace.Qu'].replace({np.nan: 'No Fireplace'})

print(ames_train.isnull().sum().to_string())
price              0
area               0
MS.SubClass        0
MS.Zoning          0
Lot.Frontage       0
Lot.Area           0
Street             0
Alley              0
Lot.Shape          0
Land.Contour       0
Utilities          0
Lot.Config         0
Land.Slope         0
Neighborhood       0
Condition.1        0
Condition.2        0
Bldg.Type          0
House.Style        0
Overall.Qual       0
Overall.Cond       0
Year.Built         0
Year.Remod.Add     0
Roof.Style         0
Roof.Matl          0
Exterior.1st       0
Exterior.2nd       0
Mas.Vnr.Type       0
Mas.Vnr.Area       0
Exter.Qual         0
Exter.Cond         0
Foundation         0
Bsmt.Qual          0
Bsmt.Cond          0
Bsmt.Exposure      0
BsmtFin.Type.1     0
BsmtFin.SF.1       0
BsmtFin.Type.2     0
BsmtFin.SF.2       0
Bsmt.Unf.SF        0
Total.Bsmt.SF      0
Heating            0
Heating.QC         0
Central.Air        0
Electrical         0
X1st.Flr.SF        0
X2nd.Flr.SF        0
Low.Qual.Fin.SF    0
Bsmt.Full.Bath     0
Bsmt.Half.Bath     0
Full.Bath          0
Half.Bath          0
Bedroom.AbvGr      0
Kitchen.AbvGr      0
Kitchen.Qual       0
TotRms.AbvGrd      0
Functional         0
Fireplaces         0
Fireplace.Qu       0
Garage.Type        0
Garage.Yr.Blt      0
Garage.Finish      0
Garage.Cars        0
Garage.Area        0
Garage.Qual        0
Garage.Cond        0
Paved.Drive        0
Wood.Deck.SF       0
Open.Porch.SF      0
Enclosed.Porch     0
X3Ssn.Porch        0
Screen.Porch       0
Pool.Area          0
Pool.QC            0
Fence              0
Misc.Feature       0
Misc.Val           0
Mo.Sold            0
Yr.Sold            0
Sale.Type          0
Sale.Condition     0

EDA

From here I can already begin to assess the relationships between the variables and make inferences about them.

In [19]:
# Agora, com os dados limpos, vamos avaliar as correlações entre as variáveis numéricas
corr = ames_numeric.corr()

f, ax = plt.subplots(figsize=(36, 18))
# Escondendo os dados da parte de 'cima' da diagonal principal
mask = np.triu(np.ones_like(corr, dtype=bool))

# setando a uma paleta de cores divergentes para o mapa de calor
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Este é o único gráfico que não consigo fazer com plotly /=
sns.heatmap(corr, annot=True, mask = mask, cmap=cmap)
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x219b01434c8>

Some variables stand out for their high correlation with price. Area, as expected, is one of them and, looking closer, we see that there are several variables that make up the area of ​​the property, later on they will be unified in order to simplify the data structure and avoid multicollinearities. Overall build quality has a very high correlation. It is a discrete variable and can even be considered a category. Year of construction is also a variable expected to be important in the composition of prices, since older houses tend to have a lower value per area due to depreciation, style and construction lau-out. The garage, whether it exists or not, will have a big impact as well. Although this variable falls under the area class, described above, I will deal with it separately in the analysis.

In [20]:
ames_clean = ames_train[['price', 'area','X1st.Flr.SF','X2nd.Flr.SF','Low.Qual.Fin.SF', 'Total.Bsmt.SF', 'Year.Built', 'Overall.Qual','Neighborhood','Bldg.Type','Garage.Area','Garage.Type']]
ames_clean['Total.Area'] = ames_train['area']+ames_train['Total.Bsmt.SF']
ames_clean['PricePerSF'] = ames_clean['price']/ames_clean['Total.Area']
ames_clean = ames_clean[['price', 'Total.Area','PricePerSF', 'Year.Built','Garage.Area','Overall.Qual','Neighborhood','Bldg.Type','Garage.Type']]
ames_clean.head()
C:\Users\Yuri\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
C:\Users\Yuri\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
Out[20]:
price Total.Area PricePerSF Year.Built Garage.Area Overall.Qual Neighborhood Bldg.Type Garage.Type
0 126000 1712.0 73.598131 1939 399.0 6 SWISU 1Fam Detchd
1 139500 2098.0 66.491897 1984 266.0 5 Edwards TwnhsE Attchd
2 124900 1838.0 67.954298 1930 216.0 5 IDOTRR 1Fam Detchd
3 114000 1444.0 78.947368 1900 281.0 4 OldTown 1Fam Detchd
4 227000 2475.0 91.717172 2001 528.0 8 NWAmes 1Fam Attchd

Price behavior

As a matter of fact, the first curiosity I had was whether property prices showed significant variations in the analyzed period. And, as can be seen in the chart below, it did not. In fact, what is observed is great stability in property prices, with their medians very close. The occurrence becomes especially curious if we remember that the observed period comprises the 2008 crisis that began precisely in the real estate sector. A factor that possibly partly explains such stability is that the city of Ames has practically 2/5 of its population employed at Iowa State University or government agencies, which suggests greater stability in per capita income. This information, however, would need to be further analyzed and is currently beyond the scope of this report.

In [21]:
df = px.data.tips()
fig = px.box(ames_train, x= 'Yr.Sold', y= "price",  color = 'Yr.Sold')
fig.show()

Prices distribution

We have a slightly skewed distribution to the left, suggesting that most of the trades were in a price range that goes up to around $150,000.

In [22]:
fig = px.histogram(ames_clean, x="price", nbins=50)
fig.show()

When we look at price per area (per square foot), we have a more normal distribution, with the median close to $70 per square foot.

In [23]:
fig = px.histogram(ames_clean, x=ames_clean['PricePerSF'], nbins=30)
fig.show()

Evaluating the variables with the highest correlation (note that the area variable was created from the sum of the built areas) the result of the graphs gives us a view that was already expected a priori, but that allows us to observe the existence, or not, of outliers.

In [24]:
fig = make_subplots(rows=2, cols=2,
    subplot_titles=('Area', 'Quality', 'Construction Year', 'Garage size'))

fig.add_trace(
    go.Scatter(x = ames_clean['Total.Area'], y = ames_clean['price'], opacity=0.65,
        mode="markers"
    ),
    row=1, col=1
)

fig.add_trace(
    go.Scatter(x = ames_clean['Overall.Qual'], y = ames_clean['price'], opacity=0.65,
        mode="markers"
    ),
    row=1, col=2
)

fig.add_trace(
    go.Scatter(x = ames_clean['Year.Built'], y = ames_clean['price'], opacity=0.65,
        mode="markers"
    ),
    row=2, col=1
)


fig.add_trace(
    go.Scatter(x = ames_clean['Garage.Area'], y = ames_clean['price'], opacity=0.65,
        mode="markers"
    ),
    row=2, col=2
)

fig.update_layout(height=800, width=800, title_text="Price relationships with explanatory variables")

fig.show()

Neighborhood

Certainly, when we think about moving, one of the first things that comes to mind is the neighborhood or region of the property. Below we can see that this has a considerable impact on the selling price. While we have neighborhoods where you can't get anything for less than 170 thousand, in others, not even the most expensive properties reach this price.

In [25]:
df = px.data.tips()
fig = px.box(ames_clean, x= 'Neighborhood', y="price")
fig.show()

Looking at the average price per square foot in each neighborhood, it is possible to observe that the GrnHill neighborhood has the highest average price, which is also due to the small number of properties sold in the region.

In [26]:
ames_Neighborhood = (ames_clean.groupby('Neighborhood', as_index=True)['PricePerSF'].mean().sort_values(ascending= True))
#ames_Neighborhood['Neighborhood'] = ames_train['Neighborhood']
fig = px.bar(ames_Neighborhood, y= 'PricePerSF')
fig.show()

Garages

As the garage area showed a strong correlation with price, I decided to analyze whether the type of garage also exerts any relevant influence on property pricing. In general, the type of garage seems to positively influence the price when it is built into the house, probably because it offers more comfort and protection against the weather. On the other hand, as expected, the non-existence of a garage tends to be associated with properties sold at a lower price.

In [27]:
df = px.data.tips()
fig = px.box(ames_clean, x= 'Garage.Type', y= "price")
fig.show()

Conclusion

A good location is just as important as a spacious home or superior construction. This is clear in regions such as Stone Broken Road (StoneBr in the data set), which is close to parks, or Greensboro (Greens), which is close to the University. After controlling the location, factors such as size and type of garage, year of construction and quality of work are also of great importance.

Predicting house prices

In [28]:
# Importando os dois datasets e fazendo um merge a fim de fazer uma tratamento único nos dados
range1 = [i for i in range(2,82)]
#usecols = range1
amesTrain = pd.read_csv ('ames_train.csv', sep=';')
amesTest = pd.read_csv ('ames_test.csv', sep=';')
print(amesTrain.shape)
print(amesTest.shape)
(1000, 82)
(817, 82)
In [29]:
amesTrain.head()
Out[29]:
Unnamed: 0 PID area price MS.SubClass MS.Zoning Lot.Frontage Lot.Area Street Alley Lot.Shape Land.Contour Utilities Lot.Config Land.Slope Neighborhood Condition.1 Condition.2 Bldg.Type House.Style Overall.Qual Overall.Cond Year.Built Year.Remod.Add Roof.Style Roof.Matl Exterior.1st Exterior.2nd Mas.Vnr.Type Mas.Vnr.Area Exter.Qual Exter.Cond Foundation Bsmt.Qual Bsmt.Cond Bsmt.Exposure BsmtFin.Type.1 BsmtFin.SF.1 BsmtFin.Type.2 BsmtFin.SF.2 Bsmt.Unf.SF Total.Bsmt.SF Heating Heating.QC Central.Air Electrical X1st.Flr.SF X2nd.Flr.SF Low.Qual.Fin.SF Bsmt.Full.Bath Bsmt.Half.Bath Full.Bath Half.Bath Bedroom.AbvGr Kitchen.AbvGr Kitchen.Qual TotRms.AbvGrd Functional Fireplaces Fireplace.Qu Garage.Type Garage.Yr.Blt Garage.Finish Garage.Cars Garage.Area Garage.Qual Garage.Cond Paved.Drive Wood.Deck.SF Open.Porch.SF Enclosed.Porch X3Ssn.Porch Screen.Porch Pool.Area Pool.QC Fence Misc.Feature Misc.Val Mo.Sold Yr.Sold Sale.Type Sale.Condition
0 1 40000 856 126000 30 RL NaN 7890 Pave NaN Reg Lvl AllPub Corner Gtl SWISU Norm Norm 1Fam 1Story 6 6 1939 1950 Gable CompShg Wd Sdng Wd Sdng None 0.0 TA TA CBlock TA TA No Rec 238.0 Unf 0.0 618.0 856.0 GasA TA Y SBrkr 856 0 0 1.0 0.0 1 0 2 1 TA 4 Typ 1 Gd Detchd 1939.0 Unf 2.0 399.0 TA TA Y 0 0 0 0 166 0 NaN NaN NaN 0 3 2010 WD Normal
1 2 40001 1049 139500 120 RL 42.0 4235 Pave NaN Reg Lvl AllPub Inside Gtl Edwards Norm Norm TwnhsE 1Story 5 5 1984 1984 Gable CompShg HdBoard HdBoard BrkFace 149.0 Gd TA CBlock Gd TA Mn GLQ 552.0 ALQ 393.0 104.0 1049.0 GasA TA Y SBrkr 1049 0 0 1.0 0.0 2 0 2 1 Gd 5 Typ 0 NaN Attchd 1984.0 Fin 1.0 266.0 TA TA Y 0 105 0 0 0 0 NaN NaN NaN 0 2 2009 WD Normal
2 3 40002 1001 124900 30 C (all) 60.0 6060 Pave NaN Reg Lvl AllPub Inside Gtl IDOTRR Norm Norm 1Fam 1Story 5 9 1930 2007 Hip CompShg MetalSd MetalSd None 0.0 Gd TA BrkTil TA TA No ALQ 737.0 Unf 0.0 100.0 837.0 GasA Ex Y SBrkr 1001 0 0 0.0 0.0 1 0 2 1 Gd 5 Typ 0 NaN Detchd 1930.0 Unf 1.0 216.0 TA Po N 154 0 42 86 0 0 NaN NaN NaN 0 11 2007 WD Normal
3 4 40003 1039 114000 70 RL 80.0 8146 Pave NaN Reg Lvl AllPub Corner Gtl OldTown Norm Norm 1Fam 2Story 4 8 1900 2003 Gable CompShg MetalSd MetalSd None 0.0 Gd Gd BrkTil Fa TA No Unf 0.0 Unf 0.0 405.0 405.0 GasA Gd Y SBrkr 717 322 0 0.0 0.0 1 0 2 1 TA 6 Typ 0 NaN Detchd 1940.0 Unf 1.0 281.0 TA TA N 0 0 168 0 111 0 NaN NaN NaN 0 5 2009 WD Normal
4 5 40004 1665 227000 60 RL 70.0 8400 Pave NaN Reg Lvl AllPub Inside Gtl NWAmes Norm Norm 1Fam 2Story 8 6 2001 2001 Gable CompShg VinylSd VinylSd None 0.0 Gd TA PConc Gd TA No GLQ 643.0 Unf 0.0 167.0 810.0 GasA Ex Y SBrkr 810 855 0 1.0 0.0 2 1 3 1 Gd 6 Typ 0 NaN Attchd 2001.0 Fin 2.0 528.0 TA TA Y 0 45 0 0 0 0 NaN NaN NaN 0 11 2009 WD Normal
In [30]:
# Separando os ids de cada data set para posteriormente poder separar os dados para treino e teste
#amesTest['price'] = np.nan
ames_df = pd.concat([amesTrain, amesTest], axis=0)
trainPID = amesTrain['PID']
testPID = amesTest['PID']
In [31]:
ames_df = ames_df[['PID','price', 'area','X1st.Flr.SF','X2nd.Flr.SF','Low.Qual.Fin.SF', 'Total.Bsmt.SF', 'Year.Built', 'Overall.Qual','Full.Bath','TotRms.AbvGrd','Fireplaces','Neighborhood','Bldg.Type','Garage.Area','Garage.Type']]
In [32]:
# Tratando valores faltantes com o mesmo critério utilizado na EDA
ames_df['Total.Bsmt.SF'] = ames_df['Total.Bsmt.SF'].fillna(ames_df['Total.Bsmt.SF'].mean())
ames_df['Garage.Area'] = ames_df['Garage.Area'].fillna(0)
ames_df['Garage.Type'] = ames_df['Garage.Type'].replace({np.nan: 'No Garage'})
In [33]:
# Criando as variáveis de área total e preço por sf conforme feiot na EDA
ames_df['Total.Area'] = ames_df['area']+ames_df['Total.Bsmt.SF']
ames_df['PricePerSF'] = ames_df['price']/ames_df['Total.Area']
# Utilizarei na regressão um conjunto de dados menor que a totalidade pois temos muitas variáveis sem correlação
# e muitas variáveis categóricas relacionadas entre si. 
# A excessão é a variável Neighborhood que demonstrou grande impacto na análise prévia
ames_df = ames_df[['PID','price', 'Total.Area','PricePerSF','Garage.Area', 'Year.Built', 'Overall.Qual','Full.Bath','TotRms.AbvGrd','Fireplaces', 'Neighborhood']]
ames_df.head()
Out[33]:
PID price Total.Area PricePerSF Garage.Area Year.Built Overall.Qual Full.Bath TotRms.AbvGrd Fireplaces Neighborhood
0 40000 126000 1712.0 73.598131 399.0 1939 6 1 4 1 SWISU
1 40001 139500 2098.0 66.491897 266.0 1984 5 2 5 0 Edwards
2 40002 124900 1838.0 67.954298 216.0 1930 5 1 5 0 IDOTRR
3 40003 114000 1444.0 78.947368 281.0 1900 4 1 6 0 OldTown
4 40004 227000 2475.0 91.717172 528.0 2001 8 2 6 0 NWAmes
In [34]:
#### encodando a variável categórica em númerica a fim de poder encaixá-la no modelo
ames_df[['Neighborhood']] = ames_df[['Neighborhood']].apply(pd.Categorical)
#ames_df.loc[ames_df['PID'] == testPID['PID'], 'price'] = np.nan
#ames_df.loc[ames_df['PID'] == testPID['PID']
#ames_df['price'] = np.where((ames_df.['PID'] == testPID['PID']),np.nan, ames_df.price, inplace=True)
#ames_df.loc[ames_df['PID'].isin(testPID), 'price'] = np.nan
ames_df["Neighborhood"] = ames_df["Neighborhood"].cat.codes
ames_df.head()
Out[34]:
PID price Total.Area PricePerSF Garage.Area Year.Built Overall.Qual Full.Bath TotRms.AbvGrd Fireplaces Neighborhood
0 40000 126000 1712.0 73.598131 399.0 1939 6 1 4 1 21
1 40001 139500 2098.0 66.491897 266.0 1984 5 2 5 0 7
2 40002 124900 1838.0 67.954298 216.0 1930 5 1 5 0 11
3 40003 114000 1444.0 78.947368 281.0 1900 4 1 6 0 20
4 40004 227000 2475.0 91.717172 528.0 2001 8 2 6 0 17
In [35]:
ames_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1817 entries, 0 to 816
Data columns (total 11 columns):
PID              1817 non-null int64
price            1817 non-null int64
Total.Area       1817 non-null float64
PricePerSF       1817 non-null float64
Garage.Area      1817 non-null float64
Year.Built       1817 non-null int64
Overall.Qual     1817 non-null int64
Full.Bath        1817 non-null int64
TotRms.AbvGrd    1817 non-null int64
Fireplaces       1817 non-null int64
Neighborhood     1817 non-null int8
dtypes: float64(3), int64(7), int8(1)
memory usage: 157.9 KB
In [36]:
# Separando entre os conjuntos de variáveis explicativas e variável resposta
#,'Blmngtn','Blueste','BrDale','BrkSide','ClearCr','CollgCr','Crawfor','Edwards','Gilbert','Greens','GrnHill','IDOTRR','Landmrk',
#'MeadowV','Mitchel','NAmes'  ,'NPkVill','NWAmes' ,'NoRidge','NridgHt','OldTown','SWISU','Sawyer' ,'SawyerW','Somerst','StoneBr','Timber','Veenker'
X = ames_df[['PID','Total.Area', 'Year.Built', 'Overall.Qual','Garage.Area','Full.Bath','Fireplaces','TotRms.AbvGrd','PricePerSF','Neighborhood']]
y = ames_df[['PID','price']]

# utilizando os ids fornecidos para filtrar os dados de treino e de teste
X_train = X.loc[X['PID'].isin(trainPID)]
y_train = y.loc[y['PID'].isin(trainPID)]

X_test = X.loc[X['PID'].isin(testPID)]
y_test = y.loc[y['PID'].isin(testPID)]
In [37]:
# removendo os ids para não impactarem na regressão 

X_train.drop('PID', axis='columns', inplace=True)
y_train.drop('PID', axis='columns', inplace=True)
X_test.drop('PID', axis='columns', inplace=True)
y_test.drop('PID', axis='columns', inplace=True)
C:\Users\Yuri\Anaconda3\lib\site-packages\pandas\core\frame.py:4102: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [38]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()

# fazendo a regressão nos dados de treino
regressor.fit(X_train, y_train)

# Previsão dos preços nos dados de teste
y_pred = regressor.predict(X_test)

# Verificando a performance do modelo
from sklearn.metrics import r2_score, mean_squared_error

# R2
print(f"R2 score: {r2_score(y_test, y_pred)}")

# Média dos quadrados dos erros
print(f"MSE score: {mean_squared_error(y_test, y_pred)}")
R2 score: 0.9643198441482181
MSE score: 188812377.39265993

Conclusion

The R² shows us that the model seems to have a high explanatory capacity, 0.96, but the mean of the squares is very high, which indicates a high deviation between the predicted values ​​and the values ​​of the given data.